Import libraries
source("R/bioage_estimate_median.R")
source("R/gompertz_draw.R")
source("R/weibull_draw.R")
source("R/generate_data.R")
library(tidyverse)
library(MASS)
library(psbcGroup)
Contents “generate_data.R” file
Wrapper
generate_data = function(pop_n, obs_n, p, g, ltname, betafrac, seed, force_recalc, X_plots, a = exp(-9), b = 0.085, X_rho, β_rho, non_zero_betas, X_scale, active_hazard, betasep, bscale) {
print("Generating βs...")
betas = generate_betas(p = p, g=g, rho = β_rho, rho_between = 0,
seed = seed, mu_u = betasep, mu_l = -betasep, beta_scale = bscale,
beta_denom = betafrac, plot = T, non_zero_groups = non_zero_betas, active_hazard = active_hazard)
# betas$beta = betas$beta + 0.01
print("Generating X...")
X = generate_X(n = pop_n, p = p, g = g, rho = X_rho, seed = seed,
rho_between = 0,
X_plots = X_plots, scale = X_scale)
print("Done!")
print("Generating population lifetable...")
lt = generate_population_lifetable(N_pop = pop_n, M = p, betas = betas$beta,
X = X$X, filename = ltname,
seed = seed, force_recalc = force_recalc, a=a, b=b)
print("Done!")
plot = ggplot(lt, aes(x = t, y = mrl)) +
geom_line() +
labs(
title = "Population Lifetable MRL",
x = "Chronological age (years)",
y = "Mean Residual Life"
) +
theme_minimal()
print(plot)
print("Generating data from lifetable...")
df_sim = create_dataset(M = p, n_obs = obs_n, G = g,
gsize = (p/g), lt = lt, betas = betas$beta,
seednr = seed+1, X_plots = F, a=a, b=b, X_rho = X_rho,
X_scale = X_scale, followup = 20)
return(list(df = df_sim, true_betas = betas))
}
Distributions of Linpred
source("R/generate_data.R")
p = 200
g = 4
betas = generate_betas(p = p, g=g, rho = 0.9, rho_between = 0,
seed = 123, mu_u = 2, mu_l = -2, beta_scale = 0.2,
beta_denom = 1, plot = T, non_zero_groups = 0.5, active_hazard = 0.05)
## Mean of betas pre-scaling: 0.172180138647294
## Lower range of beta values pre-scaling: -1.04849838295947
## Upper range of beta values pre-scaling: 1.95110582206376



X = generate_X(n = 1000, p = p, g = g, rho = 0.9, seed = 123,
rho_between = 0,
X_plots = F, scale = 0.1)
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## ℹ The deprecated feature was likely used in the tibble package.
## Please report the issue at <https://github.com/tidyverse/tibble/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
linpreds = scale(as.matrix(X$X) %*% as.matrix(betas$beta))
linpreds %>% hist(breaks = 20)

Test
source("R/generate_data.R")
nzb = 0.5
bfrac = 40
p = 200
g = 4
bsep = 4
df_sim = generate_data(pop_n = 10000, obs_n = 500, p = p, g = g,
ltname = "19may", betafrac = bfrac, seed = 123,
force_recalc = T, X_plots = T, a = exp(-9), b = 0.085,
X_rho = 0, β_rho = 0.9, non_zero_betas = nzb, X_scale = 1,
#active_hazard = 0.025, betasep = 2)
active_hazard = 0,
#active_hazard = (0*sqrt(p))/nzb/bfrac/bsep,
betasep = bsep, bscale = 0.25)
## [1] "Generating βs..."
## Mean of betas pre-scaling: 0.345611685475195
## Lower range of beta values pre-scaling: -1.50079126072838
## Upper range of beta values pre-scaling: 2.81259560682826



## [1] "Generating X..."




## [1] "Done!"
## [1] "Generating population lifetable..."
## Range of death: 0.319790072173935
## Range of death: 150

## [1] "Done!"

## [1] "Generating data from lifetable..."
## Warning in create_dataset(M = p, n_obs = obs_n, G = g, gsize = (p/g), lt = lt,
## : Only 362 valid cases where age_start < age_death, but 500 requested.
## Nr of valid indices: 362
## Range of linpred values: -1.412591 1.405548
lt = read_rds("pop_lifetable_19may.rds")
true_betas = df_sim$true_betas
df_sim = df_sim$df
Function performance
source("R/generate_data.R")
aft_reg = function(df_sim, p) {
true_betas = df_sim$true_betas
df_sim = df_sim$df
# Train/test split 50/50
train_indices = sample(1:nrow(df_sim), nrow(df_sim)/2)
df_sim_train = df_sim[train_indices,]
df_sim_test = df_sim[-train_indices,]
# Create the formula for AFT regression
predictor_vars = paste(c(glue("x{1:p}")), collapse = " + ")
aft_formula = as.formula(paste("Surv(age_start, age_end, status) ~", predictor_vars))
# Fit the AFT model
fit_aft = tryCatch({
fit_aft = eha::aftreg(
formula = aft_formula,
data = df_sim_train,
dist = "gompertz",
control = list(
maxit = 100,
trace = FALSE
)
)
fit_aft
},
error = function(e) {
message("AFT model error: ", e$message)
NA
})
# If model fitting failed, return NA
if (is.na(fit_aft)[1]) {
return(NA)
}
# get estimated parameters
sigma_est <- exp(fit_aft$coefficients["log(scale)"]) # canonical parametrization
tau_est <- exp(fit_aft$coefficients["log(shape)"]) # canonical parametrization
a_est <- tau_est / sigma_est
b_est <- 1 / sigma_est
betas_est_aft <- fit_aft$coefficients[1:p]
linpred_aft <- rowSums(sweep(df_sim_test[,1:p], 2, betas_est_aft, "*")) # SCALE
for (i in 1:nrow(df_sim_test)){
mrl_uncon <- integrate(gomp_baseline_surv, lower = (df_sim_test$age_start[i] * exp(linpred_aft[i])),
upper=Inf, a = a_est, b = b_est)$value
s_cond <- gomp_baseline_surv(df_sim_test$age_start[i] * exp(linpred_aft[i]), a = a_est, b = b_est)
df_sim_test$pred_mrl[i] <- (mrl_uncon / s_cond) * exp(-linpred_aft[i])
}
p = df_sim_test %>% ggplot(aes(x = mrl, y = pred_mrl)) +
geom_point() +
geom_abline(color = 'red') +
theme_minimal()
print(p)
# Get the bias
aftreg_coef_rmse = sqrt(mean((betas_est_aft - true_betas$betas)^2))
# Get the RMSE
aftreg_rmse = sqrt(mean((df_sim_test$mrl - df_sim_test$pred_mrl)^2))
return(list(prediction = aftreg_rmse, coefficients = aftreg_coef_rmse))
}
psbc_reg = function(df_sim, p, g) {
true_betas = df_sim$true_betas
df_sim = df_sim$df
# Train/test split 50/50
train_indices = sample(1:nrow(df_sim), nrow(df_sim)/2)
df_sim_train = df_sim[train_indices,]
df_sim_test = df_sim[-train_indices,]
Y_train = cbind(df_sim_train$age_start, df_sim_train$age_end, df_sim_train$status)
Y_test = cbind(df_sim_test$age_start, df_sim_test$age_end, df_sim_test$status)
# Create covariate matrix X
X_train = as.matrix(df_sim_train[, paste0("x", 1:p)])
X_test = as.matrix(df_sim_test[, paste0("x", 1:p)])
XC = NULL # CONFOUNDERS
# Create group indicator for each variable
grpInx = rep(1:g, each = p/g)
# Set hyperparameters (PRIOR)?
hyperParams = list(
a.sigSq = 0.01, b.sigSq = 0.01,
mu0 = 0, h0 = 0.01,
v = 0.1
)
# Set starting values https://cran.r-project.org/web/packages/psbcGroup/psbcGroup.pdf
w = log(Y_train[,1])
mu = 0.1
beta = rep(0.1, p)
sigSq = 0.5
tauSq = rep(0.4, p)
lambdaSq = 100
betaC = rep(0.11, 0)
startValues = list(w=w, beta=beta, tauSq=tauSq, mu=mu, sigSq=sigSq,
lambdaSq=lambdaSq, betaC=betaC)
# MCMC parameters format __ CHANGE __
mcmcParams = list(
run = list(
numReps = 100,
thin = 5, # Update params every n steps
burninPerc = 0.2 # Discard the first ... steps
),
tuning = list(
mu.prop.var = 0.1,
sigSq.prop.var = 0.005,
L.beC = 50,
M.beC = 1,
eps.beC = 0.001,
L.be = 100,
M.be = 1,
eps.be = 0.001
)
)
fit_bayes_aft = aftGL_LT(Y_train, X_train, XC, grpInx, hyperParams, startValues, mcmcParams)
# Extract betas using the maximum likelihood via density
get_mode = function(x){
xdist = density(x)
mode = xdist$x[which.max(xdist$y)]
return(mode)
}
betas = fit_bayes_aft$beta.p %>% apply(MARGIN = 2, FUN = get_mode)
# Make linear predictor for MRL
X_test = X_test %>% as.matrix
linpred = X_test %*% betas
# Change to lognormal
# Create baseline lognormal survival
# Implemented in R -> check log scales
# dlnorm, use any of them 1 - F(x)
# instead of gomp_baseline_surv no false, 1-
# a = exp(-9)
# b = 0.085
# 1-plnorm(1:100, meanlog = fit_bayes_aft$mu.p[120], sdlog = fit_bayes_aft$sigSq.p[120])
# Estimate mean from MCMC draws
mu_est = mean(fit_bayes_aft$mu.p)
sigSq_est = mean(fit_bayes_aft$sigSq.p)
for (i in 1:nrow(df_sim_test)){
mrl_uncon = integrate(function(x) 1 - plnorm(x, meanlog = mu_est, sdlog = sqrt(sigSq_est)), # Var to Sd via sqrt
lower = (df_sim_test$age_start[i] * exp(linpred[i])),
upper = Inf)$value
s_cond = 1 - plnorm(df_sim_test$age_start[i] * exp(linpred[i]), meanlog = mu_est, sdlog = sqrt(sigSq_est))
df_sim_test$pred_mrl[i] <- (mrl_uncon / s_cond) * exp(-linpred[i])
}
p = df_sim_test %>% ggplot(aes(x = mrl, y = pred_mrl)) +
geom_point() +
geom_abline(color = 'red') +
theme_minimal()
print(p)
# Get the RMSE
pred_RMSE = sqrt(mean((df_sim_test$mrl - df_sim_test$pred_mrl)^2))
beta_RMSE = NA #sqrt(mean((betas$beta - true_betas$beta)^2))
return(list(prediction = pred_RMSE, coefficients = beta_RMSE))
}
run_experiment = function(pop_n = 20000, obs_n = 1000, p = 20, g = 4, ltname, bfrac, seed, nzb, bsep, X_rho) {
print("<<<<GENERATING SIMULATED DATA>>>>")
df_sim = generate_data(pop_n = pop_n, obs_n = obs_n, p = p, g = g,
ltname = ltname, betafrac = bfrac, seed = 123,
force_recalc = F, X_plots = T, a = exp(-9), b = 0.085,
X_rho = X_rho, β_rho = 0.9, non_zero_betas = nzb, X_scale = 1,
#active_hazard = 0.025, betasep = 2)
active_hazard = (0.05*sqrt(p))/nzb/bfrac/bsep,
betasep = bsep, bscale = 0.25)
set.seed(seed+1)
print("<<<<RUNNING AFT>>>>")
rmse_aft = aft_reg(df_sim = df_sim, p = p)
set.seed(seed+1)
print("<<<<RUNNING PSBC BAYESIAN>>>>")
rmse_psbc = psbc_reg(df_sim = df_sim, p = p, g = g)
return(list(aft = rmse_aft, psbc = rmse_psbc))
}
Experiments
# p = 20
rmses = run_experiment(p = 20, g = 4, ltname = "p20_may20", bfrac = 40, seed = 42, nzb = 0.5, bsep = 2, X_rho = 0)
## [1] "<<<<GENERATING SIMULATED DATA>>>>"
## [1] "Generating βs..."
## Mean of betas pre-scaling: 0.215071322236047
## Lower range of beta values pre-scaling: -0.825018359139732
## Upper range of beta values pre-scaling: 1.69028592387452



## [1] "Generating X..."




## [1] "Done!"
## [1] "Generating population lifetable..."
## [1] "Done!"

## [1] "Generating data from lifetable..."
## Warning in create_dataset(M = p, n_obs = obs_n, G = g, gsize = (p/g), lt = lt,
## : Only 791 valid cases where age_start < age_death, but 1000 requested.
## Nr of valid indices: 791
## Range of linpred values: -0.1849455 0.2802615
## [1] "<<<<RUNNING AFT>>>>"

## Warning: Unknown or uninitialised column: `betas`.
## [1] "<<<<RUNNING PSBC BAYESIAN>>>>"

print(rmses)
## $aft
## $aft$prediction
## [1] 4.485843
##
## $aft$coefficients
## [1] NaN
##
##
## $psbc
## $psbc$prediction
## [1] 15.46132
##
## $psbc$coefficients
## [1] NA
rmses = run_experiment(p = 60, g = 4, ltname = "p60_may20", bfrac = 40, seed = 43, nzb = 0.5, bsep = 2, X_rho = 0)
## [1] "<<<<GENERATING SIMULATED DATA>>>>"
## [1] "Generating βs..."
## Mean of betas pre-scaling: 0.0701313093887515
## Lower range of beta values pre-scaling: -0.68035102718843
## Upper range of beta values pre-scaling: 1.18615518586129



## [1] "Generating X..."




## [1] "Done!"
## [1] "Generating population lifetable..."
## [1] "Done!"

## [1] "Generating data from lifetable..."
## Warning in create_dataset(M = p, n_obs = obs_n, G = g, gsize = (p/g), lt = lt,
## : Only 809 valid cases where age_start < age_death, but 1000 requested.
## Nr of valid indices: 809
## Range of linpred values: -0.2904009 0.2777132
## [1] "<<<<RUNNING AFT>>>>"

## Warning: Unknown or uninitialised column: `betas`.
## [1] "<<<<RUNNING PSBC BAYESIAN>>>>"

print(rmses)
## $aft
## $aft$prediction
## [1] 10.44082
##
## $aft$coefficients
## [1] NaN
##
##
## $psbc
## $psbc$prediction
## [1] 16.17074
##
## $psbc$coefficients
## [1] NA
rmses = run_experiment(p = 100, g = 4, ltname = "p100_may20", bfrac = 40, seed = 44, nzb = 0.5, bsep = 2, X_rho = 0)
## [1] "<<<<GENERATING SIMULATED DATA>>>>"
## [1] "Generating βs..."
## Mean of betas pre-scaling: 0.206315503690729
## Lower range of beta values pre-scaling: -1.06061031001221
## Upper range of beta values pre-scaling: 1.67379955248034



## [1] "Generating X..."




## [1] "Done!"
## [1] "Generating population lifetable..."
## [1] "Done!"

## [1] "Generating data from lifetable..."
## Warning in create_dataset(M = p, n_obs = obs_n, G = g, gsize = (p/g), lt = lt,
## : Only 795 valid cases where age_start < age_death, but 1000 requested.
## Nr of valid indices: 795
## Range of linpred values: -0.5941606 0.6304133
## [1] "<<<<RUNNING AFT>>>>"
## AFT model error: Singular hessian; suspicious variable No. TRUE:
## =
## Try running with fixed shape
## [1] "<<<<RUNNING PSBC BAYESIAN>>>>"

print(rmses)
## $aft
## [1] NA
##
## $psbc
## $psbc$prediction
## [1] 23.53594
##
## $psbc$coefficients
## [1] NA
rmses = run_experiment(p = 200, g = 4, ltname = "p200_may20", bfrac = 40, seed = 45, nzb = 0.5, bsep = 2, X_rho = 0)
## [1] "<<<<GENERATING SIMULATED DATA>>>>"
## [1] "Generating βs..."
## Mean of betas pre-scaling: 0.148329628219689
## Lower range of beta values pre-scaling: -1.13669894797518
## Upper range of beta values pre-scaling: 1.65937506505303



## [1] "Generating X..."




## [1] "Done!"
## [1] "Generating population lifetable..."
## [1] "Done!"

## [1] "Generating data from lifetable..."
## Warning in create_dataset(M = p, n_obs = obs_n, G = g, gsize = (p/g), lt = lt,
## : Only 767 valid cases where age_start < age_death, but 1000 requested.
## Nr of valid indices: 767
## Range of linpred values: -0.8091164 1.047817
## [1] "<<<<RUNNING AFT>>>>"
## AFT model error: Singular hessian; suspicious variable No. TRUE:
## =
## Try running with fixed shape
## [1] "<<<<RUNNING PSBC BAYESIAN>>>>"

print(rmses)
## $aft
## [1] NA
##
## $psbc
## $psbc$prediction
## [1] 26.03662
##
## $psbc$coefficients
## [1] NA
Monte carlo
# Parameters grid:
# p
# g
#
MC = replicate(10000, {
})